######################
##### PREPARATION ####
######################
rm(list=setdiff(ls(), "path_to"))

# Load data
s = read_sav(path_to("consumers_raw"))
benchmark = read.csv2(path_to("demographics_benchmark"), as.is=T)

# Drop respondents: illegal or didn't reach main part
s = s[!s$PROLIFIC_PID %in% c("", NA, "6136f25a4906e4c2da32bc2b"),]
s = s[!duplicated(s$PROLIFIC_PID), ]
s = s[!is.na(s$education), ]

# Drop columns
drop = c(
  colnames(s)[grepl("time", colnames(s))],
  colnames(s)[grepl("attention_", colnames(s))],
  colnames(s)[grepl("meta", colnames(s))],
  setdiff(colnames(s)[grepl("^vig_", colnames(s))], "vig_reduction"),
  colnames(s)[grepl("^conseq_", colnames(s))],
  "consent", "STUDY_ID", "SESSION_ID", "b", "hidePrev"
)
s[, drop] = NULL


##############################################################
########## DEMOGRAPHICS ######################################
##############################################################
# Education
s$education_college = 1*(s$education %in% c(5,6))
s$education_nocollege = 1*(!s$education %in% c(5,6))

# Income
s$income = plyr::mapvalues(
  s$income,
  from = c(1:8),
  to = c(7500, 20000, 37500, 62500, 87500, 125000, 175000, 225000)
)
s$income_log = log(s$income)
s$income_below50 = 1*(s$income<50000)
s$income_50_100 =  1*(s$income>50000 & s$income<=100000)
s$income_100plus = 1*(s$income>100000)

# Region
states = read.csv2(path_to("states_encoding"), as.is=T)
s$region = as.character(plyr::mapvalues(as.numeric(s$state), from = states$code, to = states$region))
s$region_midwest = s$region == "midwest"
s$region_northeast = s$region == "northeast"
s$region_south = s$region == "south"
s$region_west = s$region == "west"

# Gender
s$gender_female = 1*(s$gender %in% 2)
s$gender_other = 1*(!s$gender %in% 2)

# Age
s$age_below35 = 1*(s$age < 35)
s$age_35_54 = 1*(s$age >= 35 & s$age < 55)
s$age_55plus = 1*(s$age >= 55)

# Politics
s$politics_democrat = 1*(s$politics == 2)
s$politics_independent = 1*(s$politics == 3)
s$politics_republican = 1*(s$politics == 1)

# Ethnicity
races = c("white", "black", "hispanic", "indigenous", "asian")
for (r in 1:length(races)) {
  s[, paste0("race", races[r], "_yes")] = 1*(!is.na(s[, paste0("race_", r)]))
  s[, paste0("race", races[r], "_no")] = 1*(is.na(s[, paste0("race_", r)]))
}
s$race_indigenous = NULL


##############################################################
########## DROP INCOMPLETE ###################################
##############################################################
# Create separate attrition data
attrition = s

# Drop respondents who did not complete the survey
# Drop extreme response duration
s = s[!is.na(s$politics), ]
s = s[s$Duration__in_seconds_ %>% between(quantile(., c(0.01)), quantile(., c(0.99), inbounds=F)), ]

# Store data for attrition analysis
attrition$completed_belief = 1*!is.na(attrition$effect)
attrition$completed_conseq = 1*!is.na(attrition$wtp_1)
attrition$completed_survey = 1*!is.na(attrition$politics)
attrition$final_sample = 1*(attrition$ResponseId %in% s$ResponseId)
saveRDS(attrition, path_to("attrition"))


##############################################################
########## CLEAN QUANT DATA ##################################
##############################################################
## BELIEFS
# Indicator for dampening
s$belief_dampening = 1*(s$effect %in% c(3, 4))

# Quant beliefs (change variables once label error is corrected)
s$vig_reduction = gsub("[^0-9]", "", s$vig_reduction) %>% as.numeric
s$quant_effect = 1 / s$vig_reduction * ifelse(s$effect == 1, -1 * s$quant_dec_more * s$vig_reduction,
                                       ifelse(s$effect == 2, s$vig_reduction %>% as.numeric,
                                       ifelse(s$effect == 3, -1 * s$quant_dec_less * s$vig_reduction,
                                       ifelse(s$effect == 4, 0,
                                       ifelse(s$effect == 5, -1 * s$quant_inc * s$vig_reduction, NA)))))

## VALUATIONS
# WTP Ratio and Differences
s$wtp_ratio = ifelse(s$wtp_1 == 0, NA, s$wtp_2 / s$wtp_1) %>% oob_squish(c(0, 1))
s$wtp_ratio_class = cut(
  s$wtp_ratio, breaks=c(-0.0001, 0.0001, 0.25, 0.5, 0.75, 0.9999, 1.0001),
  labels=c("0", "(0,0.25]", "(0.25,0.5]", "(0.5,0.75]", "(0.75,1)", "1")
)

# Censoring extreme values
for (v in unique(s$conseq)) {
  s[s$conseq == v, "wtp_1_censored"] = s[s$conseq == v, "wtp_1", drop=T] %>%
    oob_squish(quantile(., c(0, 0.95)))
  s[s$conseq == v, "wtp_2_censored"] = s[s$conseq == v, "wtp_2", drop=T] %>%
    oob_squish(quantile(., c(0, 0.95)))
}

# Indicators
s$conseq_strict = 1*(s$wtp_ratio_class %in% 0)
s$conseq_weak = 1*(!s$wtp_ratio_class %in% c(1, NA))
s$conseq_deont = 1*(s$wtp_ratio_class %in% 1)
s$conseq_dontcare = 1*(s$wtp_1 == 0)


##############################################################
########## CONSUMPTION DATA ##################################
##############################################################
# Relevant consumers for respective belief case
s$consumer_vig = 1*ifelse(s$vig == "meat", s$meat >= 1,
                   ifelse(s$vig == "fuel", s$miles >= 5000,
                   ifelse(s$vig == "flights", s$flights >= 1,
                   ifelse(s$vig == "energy", s$energybill >= 500,
                   ifelse(s$vig == "subelectricity", s$electricity >= 500,
                   ifelse(s$vig == "subhouse", s$energybill >= 500,
                   ifelse(s$vig == "subcoffee", s$coffee >= 1 & s$coffee_fairtrade_1 >= 50,
                   ifelse(s$vig == "subclothing", s$clothing_secondhand_1 >= 50, NA))))))))

# Relevant consumer for respective valuation case
s$consumer_conseq = 1*ifelse(s$conseq == "co2", T,
                      ifelse(s$conseq == "chicken", s$meat>=1 & s$meat_organic_1>=50,
                      ifelse(s$conseq == "textile", s$clothing_fairtrade_1 >= 50,
                      ifelse(s$conseq == "waste", T, NA))))


##############################################################
########## WEIGHTS ###########################################
##############################################################
# Define target levels and weighting variables
target = benchmark$freq
names(target) = benchmark$variable
cats = unique(gsub("_.*", "", names(target)))
target_ls = list()
for (c in cats) {
  vars = names(target)[grepl(c, names(target))]
  target_ls[[paste0(c, "_wgt")]] = target[vars]
  s[, paste0(c, "_wgt")] = factor(sapply(1:nrow(s), function(i) {
    s[i, vars] %>% as.logical %>% which %>% vars[.]
  }))
}
  
# Rake!
rake = suppressWarnings(anesrake(
  inputter = target_ls, dataframe = as.data.frame(s), caseid = s$ResponseId,
  weightvec = rep(1, nrow(s)), cap = 5, verbose = F, type = "nolim"
))
#summary(rake)
#hist(rake$weightvec)
#quantile(rake$weightvec, c(0, 0.01, 0.05, 0.1, 0.2, 0.8, 0.9, 0.95, 0.99, 1))

# Add to data
s$wgt = rake$weightvec[s$ResponseId]


##############################################################
########## SAVE DATA #########################################
##############################################################
saveRDS(s, path_to("wide"))

